Like the SQL “CASE WHEN” statement and the switch statement found in popular programming languages, the Spark SQL DataFrame API supports similar syntax using “when otherwise”, or we can use a “case when” expression directly. So let’s see an example of how to check for multiple conditions and replicate the SQL CASE statement.
// Create DataFrame: sample employee data with columns
// (empno, ename, job, mgr, hiredate, sal, comm, deptno).
// NOTE(review): the original line had the prose heading "Create DataFrame"
// fused onto the code; it is restored here as a comment.
val empDF = spark.createDataFrame(Seq(
  (7369, "SMITH", "CLERK", 7902, "17-Dec-80", 800, 20, 10),
  (7499, "ALLEN", "SALESMAN", 7698, "20-Feb-81", 1600, 300, 30),
  (7521, "WARD", "SALESMAN", 7698, "22-Feb-81", 1250, 500, 30),
  (7566, "JONES", "MANAGER", 7839, "2-Apr-81", 2975, 0, 20),
  (7654, "MARTIN", "SALESMAN", 7698, "28-Sep-81", 1250, 1400, 30),
  (7698, "BLAKE", "MANAGER", 7839, "1-May-81", 2850, 0, 30),
  (7782, "CLARK", "MANAGER", 7839, "9-Jun-81", 2450, 0, 10),
  (7788, "SCOTT", "ANALYST", 7566, "19-Apr-87", 3000, 0, 20),
  (7839, "KING", "PRESIDENT", 0, "17-Nov-81", 5000, 0, 10),
  (7844, "TURNER", "SALESMAN", 7698, "8-Sep-81", 1500, 0, 30),
  (7876, "ADAMS", "CLERK", 7788, "23-May-87", 1100, 0, 20)
)).toDF("empno", "ename", "job", "mgr", "hiredate", "sal", "comm", "deptno")
// show() has a side effect (prints to stdout), so keep the parentheses.
empDF.show()
Use of case in DataFrame(when otherwise)
import org.apache.spark.sql.functions.{when, _}
// Map each job title to a short code with the when/otherwise DataFrame API
// (the DataFrame equivalent of SQL CASE WHEN).
// FIX: the original assigned the result of .show (which returns Unit) to df,
// so df was useless afterwards. Keep the DataFrame in df and show it separately.
val df = empDF.withColumn("Job_CD",
  when(col("job") === "MANAGER", "MR")
    .when(col("job") === "CLERK", "CL")
    .when(col("job") === "ANALYST", "ANA")
    .when(col("job") === "PRESIDENT", "PRE")
    .when(col("job") === "SALESMAN", "SALE")
    .otherwise("Unknown")) // jobs matching no branch fall through to "Unknown"
df.show()
Use of case in DataFrame (using expr with SQL CASE WHEN syntax)
// Same job-code mapping expressed as a raw SQL CASE inside expr().
// FIXES: 'UNKOWN' typo corrected to 'UNKNOWN' (it is an output value, not just
// cosmetics); dropped the redundant .alias — withColumn already names the
// column; don't assign the Unit result of .show to df.
val df = empDF.withColumn("job_CD",
  expr(
    """case when job = 'MANAGER'   then 'MR'
      |     when job = 'CLERK'     then 'CL'
      |     when job = 'ANALYST'   then 'ANA'
      |     when job = 'PRESIDENT' then 'PRE'
      |     when job = 'SALESMAN'  then 'SALE'
      |     else 'UNKNOWN' end""".stripMargin))
df.show()
// Variant using select: keep every existing column (col("*")) and append a
// derived JOB_CD column computed by a SQL CASE expression.
// FIX: 'UNKOWN' typo corrected to 'UNKNOWN'.
val df = empDF.select(col("*"),
  expr(
    """CASE WHEN job = 'MANAGER'   THEN 'MR'
      |     WHEN job = 'CLERK'     THEN 'CL'
      |     WHEN job = 'ANALYST'   THEN 'ANA'
      |     WHEN job = 'PRESIDENT' THEN 'PRE'
      |     WHEN job = 'SALESMAN'  THEN 'SALE'
      |     ELSE 'UNKNOWN' END""".stripMargin).alias("JOB_CD"))
Use of case in DataFrame (using && and || operators)
import org.apache.spark.sql.functions._
// Combine multiple conditions inside when() using && (AND) and || (OR).
// NOTE: the PRESIDENT branch intentionally uses || — it matches when the job
// is PRESIDENT *or* the department is 30, to demonstrate the OR operator.
// FIX: don't assign the Unit result of .show to df; show separately.
val df = empDF.withColumn("Job_CD",
  when(col("job") === "MANAGER" && col("deptno") === 10, "MR")
    .when(col("job") === "CLERK" && col("deptno") === 10, "CL")
    .when(col("job") === "ANALYST" && col("deptno") === 20, "ANA")
    .when(col("job") === "PRESIDENT" || col("deptno") === 30, "PRE")
    .when(col("job") === "SALESMAN" && col("deptno") === 10, "SALE")
    .otherwise("Unknown"))
df.show()
// Derive a state abbreviation and a birth-century label for each president.
// FIX: the original was missing the opening parenthesis after select, which
// would not compile.
// NOTE(review): df_pres is defined elsewhere (not visible in this snippet);
// $-column syntax presumably relies on import spark.implicits._ — confirm.
df_pres.select(
  $"pres_name",
  $"pres_dob",
  $"pres_bs",
  when($"pres_bs" === "Virginia", "VA")
    .when($"pres_bs" === "Massachusetts", "MA")
    .when($"pres_bs" === "Ohio", "OH")
    .otherwise("Others").alias("state_abbr"),
  // No .otherwise here: dates outside 1701-2000 yield null in "Century".
  when($"pres_dob".between("1701-01-01", "1800-12-31"), "18th Century")
    .when($"pres_dob".between("1801-01-01", "1900-12-31"), "19th Century")
    .when($"pres_dob".between("1901-01-01", "2000-12-31"), "20th Century")
    .alias("Century"),
  lit("-1").alias("C3")).show(50)
No comments:
Post a Comment